1 Libraries, load data

Load requred libraries, load data and move factor columns ahead.

library(readxl)
library(tidyverse)
library(car)
library(Hmisc)
library(multcomp)
library(gridExtra)
library(vegan)
library(ggpubr)
library(plotly)
library(DESeq2)



setwd('/home/alexey/BI/R/Project_3/')
download.file(url='https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls',
              destfile='Data_Cortex_Nuclear.xls')
data <- read_xls('Data_Cortex_Nuclear.xls')

# Factors first
data <- data %>% 
  relocate(where(is.numeric), .after = where(is.character)) %>% 
  mutate(across(where(is.character), as.factor))

data[1:10,1:10]
## # A tibble: 10 x 10
##    MouseID Genotype Treatment Behavior class DYRK1A_N ITSN1_N BDNF_N NR1_N
##    <fct>   <fct>    <fct>     <fct>    <fct>    <dbl>   <dbl>  <dbl> <dbl>
##  1 309_1   Control  Memantine C/S      c-CS…    0.504   0.747  0.430  2.82
##  2 309_2   Control  Memantine C/S      c-CS…    0.515   0.689  0.412  2.79
##  3 309_3   Control  Memantine C/S      c-CS…    0.509   0.730  0.418  2.69
##  4 309_4   Control  Memantine C/S      c-CS…    0.442   0.617  0.359  2.47
##  5 309_5   Control  Memantine C/S      c-CS…    0.435   0.617  0.359  2.37
##  6 309_6   Control  Memantine C/S      c-CS…    0.448   0.628  0.367  2.39
##  7 309_7   Control  Memantine C/S      c-CS…    0.428   0.574  0.343  2.33
##  8 309_8   Control  Memantine C/S      c-CS…    0.417   0.564  0.328  2.26
##  9 309_9   Control  Memantine C/S      c-CS…    0.386   0.538  0.318  2.13
## 10 309_10  Control  Memantine C/S      c-CS…    0.381   0.499  0.362  2.10
## # … with 1 more variable: NR2A_N <dbl>

2 Describe dataset

So, what is our data?

  • Proteins! Lots of proteins! More precisely, we have 77 different proteins’ levels.
  • MouseID is a uniq ID for every mouse: length is 1080 in 1080 levels. Not really useful for us.
  • Genotype is a genotype of the mouse. It has 2 levels, “Control” for normal mice (82 mice), and “Ts65Dn” as model line for Down syndrome mice (82 mice)
  • Treatment is a description of treatment for mice. Also it has 2 levels, “Memantine” (82 mice) and “Saline” (82 mice) treatment
  • Behavior some mice have been stimulated to learn (context-shock, or “C/S” - 82 mice) and others have not (shock-context, “S/C” - 82 mice)
  • class is the most interesting column, described three previous groups in one. It combines from three factors: control (c) or trisomy (t); context-shock (CS) or shock-context (SC); memantine (m) or saline (s) treatment. Afterwards we will use it for analysis, so, look at it closely:
res <- merge(data %>% group_by(class) %>% summarise(Total_entries = n(), .groups = "keep"),
             na.omit(data) %>% group_by(class) %>% summarise(Entries_without_NA = n(), .groups = "keep"),
             by = "class")
res$class <- as.character(res$class)

res %>%  rbind(c('All classes', nrow(data), nrow(na.omit(data))))
##         class Total_entries Entries_without_NA
## 1      c-CS-m           150                 45
## 2      c-CS-s           135                 75
## 3      c-SC-m           150                 60
## 4      c-SC-s           135                 75
## 5      t-CS-m           135                 90
## 6      t-CS-s           105                 75
## 7      t-SC-m           135                 60
## 8      t-SC-s           135                 72
## 9 All classes          1080                552

So, roughly half of our data contain one or more NAs in protein levels; every class have from 135 to 150 mice, including 45 to 90 mice without any NA in 77 proteins.


3 Is there any differences in BDNF_N in different classes

For this question, we should use one-way variance analysis. Filter data from NA and see at our abundances.

aov_data <- data %>% dplyr::select(class, BDNF_N) %>% na.omit
aov_data %>% group_by(class) %>% summarise(n = n(), .groups = "keep")
## # A tibble: 8 x 2
## # Groups:   class [8]
##   class      n
##   <fct>  <int>
## 1 c-CS-m   150
## 2 c-CS-s   135
## 3 c-SC-m   150
## 4 c-SC-s   135
## 5 t-CS-m   135
## 6 t-CS-s   105
## 7 t-SC-m   135
## 8 t-SC-s   132

105 and 150 looks non-equal, but we don’t have strict criteria for number of elements, ant it is not more than 1.5 times, so, go ahead with this data.

3.1 AOV

aov_fit <- aov(BDNF_N ~ class, data = aov_data)
summary(aov_fit)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## class          7 0.2878 0.04112   18.82 <2e-16 ***
## Residuals   1069 2.3362 0.00219                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Also check, can we use variance analysis at all. Check normality of residues and Cook’s distance

fortified_data <- fortify(aov_fit)

ggplot(fortified_data, aes(x = 1:nrow(fortified_data), y = .cooksd)) + geom_bar(stat = "identity") + ggtitle("Cook's distance plot")

ggplot(fortified_data, aes(x = class, y = .stdresid)) + geom_boxplot() + ggtitle("Residuals plot")

qqPlot(aov_fit, id = FALSE, main = "Residuals plot")

Ok, Cook’s distance is lower, than 0.015 (we havn’t outliers), and residuals distributed normally. We can use aov.

3.2 Post-hoc tests

Ok, there is significant difference in BDNF_N level in different classes. In which pairs exactly they are? Perform post-hoc tests:

post_hoch <- glht(aov_fit, linfct = mcp(class = "Tukey"))
summary(post_hoch)
## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps

## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = BDNF_N ~ class, data = aov_data)
## 
## Linear Hypotheses:
##                        Estimate Std. Error t value Pr(>|t|)    
## c-CS-s - c-CS-m == 0  0.0030979  0.0055459   0.559  0.99930    
## c-SC-m - c-CS-m == 0 -0.0482717  0.0053980  -8.942  < 0.001 ***
## c-SC-s - c-CS-m == 0 -0.0258249  0.0055459  -4.657  < 0.001 ***
## t-CS-m - c-CS-m == 0 -0.0264852  0.0055459  -4.776  < 0.001 ***
## t-CS-s - c-CS-m == 0 -0.0337570  0.0059483  -5.675  < 0.001 ***
## t-SC-m - c-CS-m == 0 -0.0181541  0.0055459  -3.273  0.02420 *  
## t-SC-s - c-CS-m == 0 -0.0136310  0.0055790  -2.443  0.22105    
## c-SC-m - c-CS-s == 0 -0.0513696  0.0055459  -9.263  < 0.001 ***
## c-SC-s - c-CS-s == 0 -0.0289228  0.0056900  -5.083  < 0.001 ***
## t-CS-m - c-CS-s == 0 -0.0295831  0.0056900  -5.199  < 0.001 ***
## t-CS-s - c-CS-s == 0 -0.0368549  0.0060829  -6.059  < 0.001 ***
## t-SC-m - c-CS-s == 0 -0.0212520  0.0056900  -3.735  0.00486 ** 
## t-SC-s - c-CS-s == 0 -0.0167289  0.0057223  -2.923  0.06872 .  
## c-SC-s - c-SC-m == 0  0.0224468  0.0055459   4.047  0.00147 ** 
## t-CS-m - c-SC-m == 0  0.0217865  0.0055459   3.928  0.00222 ** 
## t-CS-s - c-SC-m == 0  0.0145147  0.0059483   2.440  0.22245    
## t-SC-m - c-SC-m == 0  0.0301176  0.0055459   5.431  < 0.001 ***
## t-SC-s - c-SC-m == 0  0.0346406  0.0055790   6.209  < 0.001 ***
## t-CS-m - c-SC-s == 0 -0.0006603  0.0056900  -0.116  1.00000    
## t-CS-s - c-SC-s == 0 -0.0079321  0.0060829  -1.304  0.89721    
## t-SC-m - c-SC-s == 0  0.0076708  0.0056900   1.348  0.87975    
## t-SC-s - c-SC-s == 0  0.0121939  0.0057223   2.131  0.39481    
## t-CS-s - t-CS-m == 0 -0.0072718  0.0060829  -1.195  0.93310    
## t-SC-m - t-CS-m == 0  0.0083311  0.0056900   1.464  0.82588    
## t-SC-s - t-CS-m == 0  0.0128542  0.0057223   2.246  0.32408    
## t-SC-m - t-CS-s == 0  0.0156029  0.0060829   2.565  0.17016    
## t-SC-s - t-CS-s == 0  0.0201260  0.0061130   3.292  0.02269 *  
## t-SC-s - t-SC-m == 0  0.0045231  0.0057223   0.790  0.99360    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

3.3 Plot

And sure, draw the plot

ggplot(aov_data, aes(x = class, y = BDNF_N)) + geom_boxplot() + theme(axis.title.x=element_blank())


4 Make linear model for ERBB4_N according other proteins

Make full model for all proteins in dataset. Remove all NA-values, and make standartisation.

proteins <- data %>% keep(is.numeric)
proteins_scale <- as.data.frame(sapply(proteins, scale)) %>% na.omit()
formula <- as.formula(ERBB4_N ~ .)
fit <- lm(formula, data = proteins_scale)
summary(fit)
## 
## Call:
## lm(formula = formula, data = proteins_scale)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5286 -0.2236 -0.0084  0.2038  2.1204 
## 
## Coefficients: (1 not defined because of singularities)
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.018686   0.030373  -0.615 0.538703    
## DYRK1A_N        -0.112916   0.123677  -0.913 0.361710    
## ITSN1_N          0.135706   0.176665   0.768 0.442776    
## BDNF_N           0.125598   0.062889   1.997 0.046378 *  
## NR1_N           -0.324746   0.154046  -2.108 0.035544 *  
## NR2A_N          -0.006494   0.084269  -0.077 0.938610    
## pAKT_N           0.219939   0.084104   2.615 0.009203 ** 
## pBRAF_N         -0.073621   0.055001  -1.339 0.181356    
## pCAMKII_N       -0.001247   0.068353  -0.018 0.985451    
## pCREB_N         -0.130058   0.062926  -2.067 0.039290 *  
## pELK_N           0.001466   0.031915   0.046 0.963391    
## pERK_N          -0.455159   0.180720  -2.519 0.012109 *  
## pJNK_N          -0.208138   0.087537  -2.378 0.017814 *  
## PKCA_N           0.029035   0.094839   0.306 0.759625    
## pMEK_N           0.026029   0.082902   0.314 0.753676    
## pNR1_N          -0.200848   0.104398  -1.924 0.054968 .  
## pNR2A_N          0.125305   0.090307   1.388 0.165927    
## pNR2B_N          0.346375   0.119623   2.896 0.003959 ** 
## pPKCAB_N         0.183064   0.088648   2.065 0.039457 *  
## pRSK_N           0.044267   0.063137   0.701 0.483573    
## AKT_N           -0.005697   0.069193  -0.082 0.934413    
## BRAF_N           0.222600   0.174199   1.278 0.201926    
## CAMKII_N        -0.011911   0.069865  -0.170 0.864700    
## CREB_N          -0.025378   0.054087  -0.469 0.639135    
## ELK_N            0.078202   0.082869   0.944 0.345811    
## ERK_N            0.202047   0.121582   1.662 0.097208 .  
## GSK3B_N         -0.094835   0.110826  -0.856 0.392588    
## JNK_N           -0.026715   0.075987  -0.352 0.725317    
## MEK_N            0.023943   0.059893   0.400 0.689510    
## TRKA_N           0.045878   0.131966   0.348 0.728257    
## RSK_N           -0.042932   0.073088  -0.587 0.557214    
## APP_N           -0.021694   0.074076  -0.293 0.769751    
## Bcatenin_N       0.032010   0.153167   0.209 0.834546    
## SOD1_N          -0.116180   0.074190  -1.566 0.118018    
## MTOR_N           0.180821   0.071743   2.520 0.012048 *  
## P38_N           -0.095941   0.079871  -1.201 0.230267    
## pMTOR_N         -0.077431   0.062293  -1.243 0.214475    
## DSCR1_N         -0.045716   0.068264  -0.670 0.503374    
## AMPKA_N          0.104715   0.095270   1.099 0.272263    
## NR2B_N           0.043254   0.053527   0.808 0.419449    
## pNUMB_N         -0.027045   0.083900  -0.322 0.747326    
## RAPTOR_N         0.170702   0.088604   1.927 0.054629 .  
## TIAM1_N         -0.146606   0.087340  -1.679 0.093892 .  
## pP70S6_N         0.028382   0.054994   0.516 0.606022    
## NUMB_N          -0.064220   0.073553  -0.873 0.383042    
## P70S6_N         -0.049965   0.058704  -0.851 0.395118    
## pGSK3B_N         0.165450   0.052053   3.179 0.001577 ** 
## pPKCG_N         -0.293093   0.075289  -3.893 0.000113 ***
## CDK5_N           0.001764   0.024323   0.073 0.942225    
## S6_N             0.142854   0.069736   2.049 0.041059 *  
## ADARB1_N        -0.062631   0.048972  -1.279 0.201557    
## AcetylH3K9_N     0.025985   0.135676   0.192 0.848199    
## RRP1_N          -0.031502   0.021968  -1.434 0.152232    
## BAX_N           -0.162854   0.044251  -3.680 0.000260 ***
## ARC_N            0.159773   0.063924   2.499 0.012775 *  
## nNOS_N          -0.007853   0.047195  -0.166 0.867918    
## Tau_N            0.324281   0.082865   3.913 0.000104 ***
## GFAP_N          -0.041293   0.047418  -0.871 0.384291    
## GluR3_N         -0.016512   0.056395  -0.293 0.769812    
## GluR4_N         -0.125414   0.060173  -2.084 0.037672 *  
## IL1B_N           0.154896   0.058877   2.631 0.008794 ** 
## P3525_N          0.099458   0.048249   2.061 0.039810 *  
## pCASP9_N         0.133665   0.050126   2.667 0.007923 ** 
## PSD95_N          0.401534   0.064325   6.242 9.55e-10 ***
## SNCA_N          -0.018225   0.055822  -0.326 0.744195    
## Ubiquitin_N      0.026834   0.056238   0.477 0.633468    
## pGSK3B_Tyr216_N  0.175747   0.062933   2.793 0.005439 ** 
## SHH_N            0.094333   0.038435   2.454 0.014471 *  
## BAD_N           -0.070404   0.068748  -1.024 0.306311    
## BCL2_N           0.011231   0.056615   0.198 0.842840    
## pS6_N                  NA         NA      NA       NA    
## pCFOS_N         -0.019510   0.049260  -0.396 0.692240    
## SYP_N            0.059670   0.047577   1.254 0.210395    
## H3AcK18_N        0.009463   0.099598   0.095 0.924349    
## EGR1_N          -0.016129   0.070213  -0.230 0.818406    
## H3MeK4_N         0.045206   0.079340   0.570 0.569097    
## CaNA_N          -0.036804   0.071937  -0.512 0.609159    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3905 on 476 degrees of freedom
## Multiple R-squared:  0.8641, Adjusted R-squared:  0.8427 
## F-statistic: 40.37 on 75 and 476 DF,  p-value: < 2.2e-16

NAs for pS6_N protein is strange - perhaps, this one aliased by another predictor?

attributes(alias(fit)$Complete)$dimnames[[1]]
## [1] "pS6_N"
apply(proteins_scale, 2, function(col)cor(col, proteins_scale$pS6_N))
##        DYRK1A_N         ITSN1_N          BDNF_N           NR1_N          NR2A_N 
##    -0.363711184    -0.223362718     0.242241256     0.404419650     0.372721618 
##          pAKT_N         pBRAF_N       pCAMKII_N         pCREB_N          pELK_N 
##     0.434484415     0.355164671     0.475099641     0.440425689    -0.034131024 
##          pERK_N          pJNK_N          PKCA_N          pMEK_N          pNR1_N 
##    -0.388125679     0.484425623     0.033929900     0.490697989     0.447917932 
##         pNR2A_N         pNR2B_N        pPKCAB_N          pRSK_N           AKT_N 
##     0.625902235     0.494241838    -0.300561178     0.008747293     0.624537450 
##          BRAF_N        CAMKII_N          CREB_N           ELK_N           ERK_N 
##    -0.379818814     0.426300117     0.260584566     0.358499140     0.274766842 
##         GSK3B_N           JNK_N           MEK_N          TRKA_N           RSK_N 
##    -0.167603930     0.218392878     0.418963882     0.279564901     0.224140611 
##           APP_N      Bcatenin_N          SOD1_N          MTOR_N           P38_N 
##     0.072285884     0.431648030     0.615216495     0.428779533     0.476743809 
##         pMTOR_N         DSCR1_N         AMPKA_N          NR2B_N         pNUMB_N 
##     0.568276940     0.282901796     0.375464887     0.500962579    -0.059493121 
##        RAPTOR_N         TIAM1_N        pP70S6_N          NUMB_N         P70S6_N 
##     0.328641375     0.296692236    -0.028621124     0.115344538     0.400447005 
##        pGSK3B_N         pPKCG_N          CDK5_N            S6_N        ADARB1_N 
##    -0.100835693    -0.181076929     0.034639173    -0.149459188     0.218258161 
##    AcetylH3K9_N          RRP1_N           BAX_N           ARC_N         ERBB4_N 
##     0.080345273     0.041362658     0.374905849     1.000000000     0.708290886 
##          nNOS_N           Tau_N          GFAP_N         GluR3_N         GluR4_N 
##     0.683079589     0.234490417    -0.085638927     0.182006656     0.243438877 
##          IL1B_N         P3525_N        pCASP9_N         PSD95_N          SNCA_N 
##     0.618194657     0.355734908     0.452368140     0.651626726     0.600497580 
##     Ubiquitin_N pGSK3B_Tyr216_N           SHH_N           BAD_N          BCL2_N 
##     0.705410687     0.044204014     0.383026252    -0.021234815     0.205289439 
##           pS6_N         pCFOS_N           SYP_N       H3AcK18_N          EGR1_N 
##     1.000000000     0.055734370     0.395031542     0.203910599     0.224491255 
##        H3MeK4_N          CaNA_N 
##     0.199488168    -0.374123955

Yep. ARC_N and pS6_N are fully correlated, so, they are aliases. We haven’t any information about significance of every protein, so, drop last one - pS6_N - and revaluate the model.

fit <- update(fit, . ~ . -pS6_N)
summary(fit)
## 
## Call:
## lm(formula = ERBB4_N ~ DYRK1A_N + ITSN1_N + BDNF_N + NR1_N + 
##     NR2A_N + pAKT_N + pBRAF_N + pCAMKII_N + pCREB_N + pELK_N + 
##     pERK_N + pJNK_N + PKCA_N + pMEK_N + pNR1_N + pNR2A_N + pNR2B_N + 
##     pPKCAB_N + pRSK_N + AKT_N + BRAF_N + CAMKII_N + CREB_N + 
##     ELK_N + ERK_N + GSK3B_N + JNK_N + MEK_N + TRKA_N + RSK_N + 
##     APP_N + Bcatenin_N + SOD1_N + MTOR_N + P38_N + pMTOR_N + 
##     DSCR1_N + AMPKA_N + NR2B_N + pNUMB_N + RAPTOR_N + TIAM1_N + 
##     pP70S6_N + NUMB_N + P70S6_N + pGSK3B_N + pPKCG_N + CDK5_N + 
##     S6_N + ADARB1_N + AcetylH3K9_N + RRP1_N + BAX_N + ARC_N + 
##     nNOS_N + Tau_N + GFAP_N + GluR3_N + GluR4_N + IL1B_N + P3525_N + 
##     pCASP9_N + PSD95_N + SNCA_N + Ubiquitin_N + pGSK3B_Tyr216_N + 
##     SHH_N + BAD_N + BCL2_N + pCFOS_N + SYP_N + H3AcK18_N + EGR1_N + 
##     H3MeK4_N + CaNA_N, data = proteins_scale)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5286 -0.2236 -0.0084  0.2038  2.1204 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.018686   0.030373  -0.615 0.538703    
## DYRK1A_N        -0.112916   0.123677  -0.913 0.361710    
## ITSN1_N          0.135706   0.176665   0.768 0.442776    
## BDNF_N           0.125598   0.062889   1.997 0.046378 *  
## NR1_N           -0.324746   0.154046  -2.108 0.035544 *  
## NR2A_N          -0.006494   0.084269  -0.077 0.938610    
## pAKT_N           0.219939   0.084104   2.615 0.009203 ** 
## pBRAF_N         -0.073621   0.055001  -1.339 0.181356    
## pCAMKII_N       -0.001247   0.068353  -0.018 0.985451    
## pCREB_N         -0.130058   0.062926  -2.067 0.039290 *  
## pELK_N           0.001466   0.031915   0.046 0.963391    
## pERK_N          -0.455159   0.180720  -2.519 0.012109 *  
## pJNK_N          -0.208138   0.087537  -2.378 0.017814 *  
## PKCA_N           0.029035   0.094839   0.306 0.759625    
## pMEK_N           0.026029   0.082902   0.314 0.753676    
## pNR1_N          -0.200848   0.104398  -1.924 0.054968 .  
## pNR2A_N          0.125305   0.090307   1.388 0.165927    
## pNR2B_N          0.346375   0.119623   2.896 0.003959 ** 
## pPKCAB_N         0.183064   0.088648   2.065 0.039457 *  
## pRSK_N           0.044267   0.063137   0.701 0.483573    
## AKT_N           -0.005697   0.069193  -0.082 0.934413    
## BRAF_N           0.222600   0.174199   1.278 0.201926    
## CAMKII_N        -0.011911   0.069865  -0.170 0.864700    
## CREB_N          -0.025378   0.054087  -0.469 0.639135    
## ELK_N            0.078202   0.082869   0.944 0.345811    
## ERK_N            0.202047   0.121582   1.662 0.097208 .  
## GSK3B_N         -0.094835   0.110826  -0.856 0.392588    
## JNK_N           -0.026715   0.075987  -0.352 0.725317    
## MEK_N            0.023943   0.059893   0.400 0.689510    
## TRKA_N           0.045878   0.131966   0.348 0.728257    
## RSK_N           -0.042932   0.073088  -0.587 0.557214    
## APP_N           -0.021694   0.074076  -0.293 0.769751    
## Bcatenin_N       0.032010   0.153167   0.209 0.834546    
## SOD1_N          -0.116180   0.074190  -1.566 0.118018    
## MTOR_N           0.180821   0.071743   2.520 0.012048 *  
## P38_N           -0.095941   0.079871  -1.201 0.230267    
## pMTOR_N         -0.077431   0.062293  -1.243 0.214475    
## DSCR1_N         -0.045716   0.068264  -0.670 0.503374    
## AMPKA_N          0.104715   0.095270   1.099 0.272263    
## NR2B_N           0.043254   0.053527   0.808 0.419449    
## pNUMB_N         -0.027045   0.083900  -0.322 0.747326    
## RAPTOR_N         0.170702   0.088604   1.927 0.054629 .  
## TIAM1_N         -0.146606   0.087340  -1.679 0.093892 .  
## pP70S6_N         0.028382   0.054994   0.516 0.606022    
## NUMB_N          -0.064220   0.073553  -0.873 0.383042    
## P70S6_N         -0.049965   0.058704  -0.851 0.395118    
## pGSK3B_N         0.165450   0.052053   3.179 0.001577 ** 
## pPKCG_N         -0.293093   0.075289  -3.893 0.000113 ***
## CDK5_N           0.001764   0.024323   0.073 0.942225    
## S6_N             0.142854   0.069736   2.049 0.041059 *  
## ADARB1_N        -0.062631   0.048972  -1.279 0.201557    
## AcetylH3K9_N     0.025985   0.135676   0.192 0.848199    
## RRP1_N          -0.031502   0.021968  -1.434 0.152232    
## BAX_N           -0.162854   0.044251  -3.680 0.000260 ***
## ARC_N            0.159773   0.063924   2.499 0.012775 *  
## nNOS_N          -0.007853   0.047195  -0.166 0.867918    
## Tau_N            0.324281   0.082865   3.913 0.000104 ***
## GFAP_N          -0.041293   0.047418  -0.871 0.384291    
## GluR3_N         -0.016512   0.056395  -0.293 0.769812    
## GluR4_N         -0.125414   0.060173  -2.084 0.037672 *  
## IL1B_N           0.154896   0.058877   2.631 0.008794 ** 
## P3525_N          0.099458   0.048249   2.061 0.039810 *  
## pCASP9_N         0.133665   0.050126   2.667 0.007923 ** 
## PSD95_N          0.401534   0.064325   6.242 9.55e-10 ***
## SNCA_N          -0.018225   0.055822  -0.326 0.744195    
## Ubiquitin_N      0.026834   0.056238   0.477 0.633468    
## pGSK3B_Tyr216_N  0.175747   0.062933   2.793 0.005439 ** 
## SHH_N            0.094333   0.038435   2.454 0.014471 *  
## BAD_N           -0.070404   0.068748  -1.024 0.306311    
## BCL2_N           0.011231   0.056615   0.198 0.842840    
## pCFOS_N         -0.019510   0.049260  -0.396 0.692240    
## SYP_N            0.059670   0.047577   1.254 0.210395    
## H3AcK18_N        0.009463   0.099598   0.095 0.924349    
## EGR1_N          -0.016129   0.070213  -0.230 0.818406    
## H3MeK4_N         0.045206   0.079340   0.570 0.569097    
## CaNA_N          -0.036804   0.071937  -0.512 0.609159    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3905 on 476 degrees of freedom
## Multiple R-squared:  0.8641, Adjusted R-squared:  0.8427 
## F-statistic: 40.37 on 75 and 476 DF,  p-value: < 2.2e-16

Next step - remove multicollinearity

vif(fit)
##        DYRK1A_N         ITSN1_N          BDNF_N           NR1_N          NR2A_N 
##       23.736607       59.473637       16.669610       96.311682       25.127918 
##          pAKT_N         pBRAF_N       pCAMKII_N         pCREB_N          pELK_N 
##       21.880134        9.671403       19.251803       17.421324        2.584965 
##          pERK_N          pJNK_N          PKCA_N          pMEK_N          pNR1_N 
##       55.739024       26.508561       38.653895       23.634116       44.380168 
##         pNR2A_N         pNR2B_N        pPKCAB_N          pRSK_N           AKT_N 
##       33.053974       55.826515       33.036076       15.913962       17.993694 
##          BRAF_N        CAMKII_N          CREB_N           ELK_N           ERK_N 
##       41.690503       16.062122        9.992444       29.844699       55.249434 
##         GSK3B_N           JNK_N           MEK_N          TRKA_N           RSK_N 
##       32.819856       18.997732       15.629309       72.480935       16.236937 
##           APP_N      Bcatenin_N          SOD1_N          MTOR_N           P38_N 
##       21.563500       91.994091       20.108411       17.860768       19.953844 
##         pMTOR_N         DSCR1_N         AMPKA_N          NR2B_N         pNUMB_N 
##       17.905757       14.567686       27.108067       11.017845       20.067495 
##        RAPTOR_N         TIAM1_N        pP70S6_N          NUMB_N         P70S6_N 
##       19.127590       21.793679        9.784679       23.482651       13.516220 
##        pGSK3B_N         pPKCG_N          CDK5_N            S6_N        ADARB1_N 
##       12.102837       22.052092        2.976573       17.503666        9.335931 
##    AcetylH3K9_N          RRP1_N           BAX_N           ARC_N          nNOS_N 
##       42.778299        1.716449        8.071101       14.624584        9.589675 
##           Tau_N          GFAP_N         GluR3_N         GluR4_N          IL1B_N 
##       17.829428        5.422842       10.719066        8.004983        9.575112 
##         P3525_N        pCASP9_N         PSD95_N          SNCA_N     Ubiquitin_N 
##        7.896508        9.082792       14.501625        9.291047       10.532632 
## pGSK3B_Tyr216_N           SHH_N           BAD_N          BCL2_N         pCFOS_N 
##       11.355079        5.101022       13.413137        8.480518        6.364879 
##           SYP_N       H3AcK18_N          EGR1_N        H3MeK4_N          CaNA_N 
##        9.429731       36.335690       13.981307       15.299243       19.739341

Lots and lots of collinear predictors. Bad for us. Try to make auto drop for every predictor until vif() will be less than 2.

refit <- function(df){
  ## God, I hate update() function: it works wrong, when you try to pass a name of 
  ## predictor to remove, and do not return an error, where it must. 
  predictors <- colnames(df)
  # remove alias and our protein
  predictors <- predictors[predictors != "ERBB4_N"]
  predictors <- predictors[predictors != "pS6_N"]
  # create lm
  fit <- lm(as.formula(paste("ERBB4_N ~ ", paste0(predictors, collapse="+"))), data = df)
  vif <- vif(fit)
  # predictor with highest VIF
  predictor <- vif[vif == max(vif)]
  while(predictor >= 2){
    print(paste(names(predictor), "- removed with VIF ", unname(predictor)))
    predictors <- predictors[predictors != names(predictor)]
    formula <- paste("ERBB4_N ~ ", paste(predictors, collapse="+"),sep = "")
    fit <- lm(formula, data = df)
    vif <- vif(fit)
    predictor <- vif[vif == max(vif(fit))]
  }
  predictors
}

good_predictors <- refit(proteins_scale)
## [1] "NR1_N - removed with VIF  96.3116818563355"
## [1] "Bcatenin_N - removed with VIF  80.185395754743"
## [1] "TRKA_N - removed with VIF  68.7125497229012"
## [1] "ITSN1_N - removed with VIF  57.1982298340348"
## [1] "pERK_N - removed with VIF  51.2405869584966"
## [1] "pNR2B_N - removed with VIF  46.8357130093968"
## [1] "AcetylH3K9_N - removed with VIF  41.5845752162506"
## [1] "ERK_N - removed with VIF  39.7205044147796"
## [1] "PKCA_N - removed with VIF  37.3065887090433"
## [1] "ELK_N - removed with VIF  28.116283092608"
## [1] "pNR1_N - removed with VIF  23.3735321503139"
## [1] "GSK3B_N - removed with VIF  22.6660059168382"
## [1] "AMPKA_N - removed with VIF  21.2365225537289"
## [1] "pJNK_N - removed with VIF  21.0481571552638"
## [1] "NUMB_N - removed with VIF  20.8675977435915"
## [1] "pPKCAB_N - removed with VIF  18.9821861663082"
## [1] "DYRK1A_N - removed with VIF  18.0752682287076"
## [1] "pAKT_N - removed with VIF  17.2691438495695"
## [1] "JNK_N - removed with VIF  16.7194557418703"
## [1] "RAPTOR_N - removed with VIF  16.2282384000947"
## [1] "MTOR_N - removed with VIF  15.8528103748328"
## [1] "CaNA_N - removed with VIF  15.4444949118682"
## [1] "NR2A_N - removed with VIF  15.343761866049"
## [1] "pMEK_N - removed with VIF  13.7359014796248"
## [1] "H3MeK4_N - removed with VIF  13.3509829243519"
## [1] "pPKCG_N - removed with VIF  13.0995724187498"
## [1] "BDNF_N - removed with VIF  12.9091231449963"
## [1] "TIAM1_N - removed with VIF  12.6995851122132"
## [1] "pMTOR_N - removed with VIF  12.6015242679877"
## [1] "ARC_N - removed with VIF  12.3383491154238"
## [1] "MEK_N - removed with VIF  11.923004066665"
## [1] "EGR1_N - removed with VIF  11.8167260554141"
## [1] "CAMKII_N - removed with VIF  10.6323500515232"
## [1] "pCREB_N - removed with VIF  10.3260375314912"
## [1] "pNR2A_N - removed with VIF  9.52985009727428"
## [1] "NR2B_N - removed with VIF  8.50935947978871"
## [1] "CREB_N - removed with VIF  7.90086554849432"
## [1] "Ubiquitin_N - removed with VIF  7.66557004242396"
## [1] "pRSK_N - removed with VIF  7.3979818344419"
## [1] "BAD_N - removed with VIF  7.13628175472155"
## [1] "Tau_N - removed with VIF  6.89207485191464"
## [1] "pNUMB_N - removed with VIF  6.54094589915653"
## [1] "S6_N - removed with VIF  6.37878140046559"
## [1] "pBRAF_N - removed with VIF  6.28454019125777"
## [1] "pGSK3B_N - removed with VIF  5.76002644992395"
## [1] "GluR3_N - removed with VIF  5.37980828624963"
## [1] "P70S6_N - removed with VIF  4.83423607678496"
## [1] "P38_N - removed with VIF  4.57682740619134"
## [1] "nNOS_N - removed with VIF  4.45584083654734"
## [1] "IL1B_N - removed with VIF  4.19530094810221"
## [1] "AKT_N - removed with VIF  4.04897235157331"
## [1] "BCL2_N - removed with VIF  3.81577483065807"
## [1] "BRAF_N - removed with VIF  3.71514077587936"
## [1] "PSD95_N - removed with VIF  3.35832664299129"
## [1] "P3525_N - removed with VIF  3.20195587736889"
## [1] "SYP_N - removed with VIF  3.08932735156774"
## [1] "pCFOS_N - removed with VIF  2.83206948086796"
## [1] "BAX_N - removed with VIF  2.55337975610498"
## [1] "GluR4_N - removed with VIF  2.53439354905622"
## [1] "APP_N - removed with VIF  2.3867554632591"
## [1] "pGSK3B_Tyr216_N - removed with VIF  2.06324072435886"
print(good_predictors)
##  [1] "pCAMKII_N" "pELK_N"    "RSK_N"     "SOD1_N"    "DSCR1_N"   "pP70S6_N" 
##  [7] "CDK5_N"    "ADARB1_N"  "RRP1_N"    "GFAP_N"    "pCASP9_N"  "SNCA_N"   
## [13] "SHH_N"     "H3AcK18_N"

Recalculate reduced LM with good predictors only

formula <- paste("ERBB4_N ~ ", paste(good_predictors, collapse="+"),sep = "")
fit_red <- lm(formula, data = proteins_scale)

summary(fit_red)
## 
## Call:
## lm(formula = formula, data = proteins_scale)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.13486 -0.35001  0.01093  0.34896  2.30041 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.05471    0.02751  -1.989  0.04724 *  
## pCAMKII_N    0.06976    0.03160   2.207  0.02772 *  
## pELK_N       0.04066    0.03405   1.194  0.23291    
## RSK_N        0.19523    0.03548   5.502 5.81e-08 ***
## SOD1_N       0.07818    0.03230   2.421  0.01582 *  
## DSCR1_N     -0.06860    0.03495  -1.963  0.05019 .  
## pP70S6_N    -0.06879    0.03473  -1.981  0.04814 *  
## CDK5_N       0.06013    0.02689   2.236  0.02575 *  
## ADARB1_N     0.28033    0.02841   9.867  < 2e-16 ***
## RRP1_N      -0.02269    0.02807  -0.808  0.41922    
## GFAP_N      -0.14566    0.04048  -3.598  0.00035 ***
## pCASP9_N     0.43074    0.03050  14.121  < 2e-16 ***
## SNCA_N       0.12312    0.03738   3.294  0.00105 ** 
## SHH_N        0.25811    0.03022   8.540  < 2e-16 ***
## H3AcK18_N    0.27615    0.03124   8.840  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5753 on 537 degrees of freedom
## Multiple R-squared:  0.6672, Adjusted R-squared:  0.6585 
## F-statistic: 76.91 on 14 and 537 DF,  p-value: < 2.2e-16
vif(fit_red)
## pCAMKII_N    pELK_N     RSK_N    SOD1_N   DSCR1_N  pP70S6_N    CDK5_N  ADARB1_N 
##  1.895677  1.354908  1.762407  1.755229  1.759048  1.797288  1.675283  1.447068 
##    RRP1_N    GFAP_N  pCASP9_N    SNCA_N     SHH_N H3AcK18_N 
##  1.290564  1.820407  1.549181  1.918910  1.452835  1.646402

Remove non-significant and predictors until all of them are significant at 0.05 or less (one by one, but in code next it done in bulk for shortage)

fit_red <- fit_red %>% update(.~.-pELK_N -RRP1_N)
fit_red <- fit_red %>% update(.~.-DSCR1_N -pCAMKII_N -pP70S6_N)
summary(fit_red)
## 
## Call:
## lm(formula = ERBB4_N ~ RSK_N + SOD1_N + CDK5_N + ADARB1_N + GFAP_N + 
##     pCASP9_N + SNCA_N + SHH_N + H3AcK18_N, data = proteins_scale)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.15625 -0.34013 -0.00287  0.35036  2.40904 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.04841    0.02711  -1.785  0.07476 .  
## RSK_N        0.17514    0.02975   5.888 6.88e-09 ***
## SOD1_N       0.08899    0.03220   2.764  0.00591 ** 
## CDK5_N       0.05865    0.02627   2.232  0.02602 *  
## ADARB1_N     0.29353    0.02612  11.239  < 2e-16 ***
## GFAP_N      -0.16036    0.03786  -4.236 2.67e-05 ***
## pCASP9_N     0.45504    0.02786  16.334  < 2e-16 ***
## SNCA_N       0.10454    0.03563   2.934  0.00348 ** 
## SHH_N        0.25409    0.02987   8.508  < 2e-16 ***
## H3AcK18_N    0.26532    0.02999   8.848  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5783 on 542 degrees of freedom
## Multiple R-squared:  0.6607, Adjusted R-squared:  0.6551 
## F-statistic: 117.3 on 9 and 542 DF,  p-value: < 2.2e-16

Our model is not good (explain just 0.66 of all variance), but let’s test it - perhapse, it isn’t useful at all.

4.1 Test model

fortified_fit_red <- data.frame(fortify(fit_red), proteins_scale)

gg_resid <- ggplot(data = fortified_fit_red, aes(x = .fitted, y = .stdresid)) + 
  geom_point() + 
  geom_hline(yintercept = 0) +
  geom_smooth(method = "lm") +
  geom_hline(yintercept = 2, color = "red") +
  geom_hline(yintercept = -2, color = "red")
gg_resid
## `geom_smooth()` using formula 'y ~ x'

Residuals demonstrate heavy outliers. Model should be corrected (perhaps, add back some of discarded predictors)

ggplot(fortified_fit_red, aes(x = 1:nrow(fortified_fit_red), y = .cooksd)) + 
  geom_bar(stat = "identity") + 
  geom_hline(yintercept = 2, color = "red")

Ok, no overweighted values

qqPlot(fortified_fit_red$.stdresid)

## [1] 37 88

Not very good, but looks like

predictors <- colnames(proteins_scale %>% dplyr::select(-pS6_N, -ERBB4_N))
dropped_predictors <- predictors[!(predictors %in% good_predictors)]
draw_it <- function(i){
  gg_resid + aes(x = fortified_fit_red[,i]) + xlab(i) + theme(axis.title.y=element_blank())
}
plots <- lapply(dropped_predictors, draw_it)
ggarrange(plotlist = plots, ncol = 4, nrow = 3)
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## attr(,"class")
## [1] "list"      "ggarrange"

Okay, there is correlation with several excluded predictors (for example, PSD95_N, pCFOS_N and other), which should be included back to model again. But we will not do it here.

4.2 Should we use LM or not?

We have lots of multicollinear predictors in a dataset. But even we drop 3/4 of them, we have model with quite large residuals, and our best result explain about 0.66 of all variability - so, I think, LM isn’t good choice.


5 PCA

Make an ordinations

pca <- rda(proteins_scale)
head(summary(pca))
## 
## Call:
## rda(X = proteins_scale) 
## 
## Partitioning of variance:
##               Inertia Proportion
## Total           72.73          1
## Unconstrained   72.73          1
## 
## Eigenvalues, and their contribution to the variance 
## 
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Eigenvalue            23.7479 11.7196 7.13238 6.60687 2.70290 2.68669 2.23181
## Proportion Explained   0.3265  0.1611 0.09806 0.09084 0.03716 0.03694 0.03068
## Cumulative Proportion  0.3265  0.4876 0.58570 0.67654 0.71370 0.75064 0.78132
##                           PC8     PC9    PC10    PC11    PC12    PC13     PC14
## Eigenvalue            1.74769 1.65562 1.24899 1.05229 0.95354 0.87466 0.721917
## Proportion Explained  0.02403 0.02276 0.01717 0.01447 0.01311 0.01203 0.009926
## Cumulative Proportion 0.80535 0.82812 0.84529 0.85976 0.87287 0.88489 0.894817
##                           PC15     PC16     PC17     PC18     PC19     PC20
## Eigenvalue            0.590981 0.549009 0.468580 0.457849 0.428005 0.398369
## Proportion Explained  0.008125 0.007548 0.006442 0.006295 0.005885 0.005477
## Cumulative Proportion 0.902942 0.910490 0.916933 0.923228 0.929112 0.934590
##                           PC21     PC22     PC23     PC24     PC25     PC26
## Eigenvalue            0.358522 0.329703 0.304326 0.284594 0.229786 0.219260
## Proportion Explained  0.004929 0.004533 0.004184 0.003913 0.003159 0.003015
## Cumulative Proportion 0.939519 0.944052 0.948236 0.952149 0.955308 0.958323
##                           PC27     PC28     PC29     PC30     PC31     PC32
## Eigenvalue            0.202036 0.192831 0.161839 0.159576 0.141616 0.127370
## Proportion Explained  0.002778 0.002651 0.002225 0.002194 0.001947 0.001751
## Cumulative Proportion 0.961101 0.963752 0.965977 0.968171 0.970118 0.971869
##                           PC33     PC34     PC35     PC36    PC37    PC38
## Eigenvalue            0.123317 0.119153 0.111057 0.103473 0.09236 0.08948
## Proportion Explained  0.001695 0.001638 0.001527 0.001423 0.00127 0.00123
## Cumulative Proportion 0.973565 0.975203 0.976730 0.978152 0.97942 0.98065
##                           PC39     PC40     PC41     PC42      PC43      PC44
## Eigenvalue            0.084482 0.080133 0.076943 0.076131 0.0688252 0.0659858
## Proportion Explained  0.001162 0.001102 0.001058 0.001047 0.0009463 0.0009072
## Cumulative Proportion 0.981814 0.982916 0.983974 0.985020 0.9859666 0.9868738
##                            PC45      PC46      PC47      PC48      PC49
## Eigenvalue            0.0633949 0.0627262 0.0593197 0.0526344 0.0510792
## Proportion Explained  0.0008716 0.0008624 0.0008156 0.0007237 0.0007023
## Cumulative Proportion 0.9877455 0.9886079 0.9894234 0.9901471 0.9908494
##                            PC50      PC51      PC52      PC53      PC54
## Eigenvalue            0.0479737 0.0452658 0.0441243 0.0428464 0.0409369
## Proportion Explained  0.0006596 0.0006224 0.0006067 0.0005891 0.0005628
## Cumulative Proportion 0.9915090 0.9921313 0.9927380 0.9933271 0.9938899
##                            PC55      PC56      PC57      PC58      PC59
## Eigenvalue            0.0367938 0.0359695 0.0340511 0.0325540 0.0308476
## Proportion Explained  0.0005059 0.0004945 0.0004682 0.0004476 0.0004241
## Cumulative Proportion 0.9943958 0.9948903 0.9953585 0.9958061 0.9962302
##                            PC60      PC61      PC62      PC63      PC64
## Eigenvalue            0.0291207 0.0266732 0.0252606 0.0226115 0.0215873
## Proportion Explained  0.0004004 0.0003667 0.0003473 0.0003109 0.0002968
## Cumulative Proportion 0.9966306 0.9969973 0.9973446 0.9976555 0.9979523
##                            PC65      PC66      PC67      PC68      PC69
## Eigenvalue            0.0191436 0.0188885 0.0180865 0.0165203 0.0146492
## Proportion Explained  0.0002632 0.0002597 0.0002487 0.0002271 0.0002014
## Cumulative Proportion 0.9982155 0.9984752 0.9987239 0.9989510 0.9991524
##                            PC70     PC71      PC72      PC73      PC74
## Eigenvalue            0.0127597 0.012004 0.0097787 0.0094075 6.621e-03
## Proportion Explained  0.0001754 0.000165 0.0001344 0.0001293 9.103e-05
## Cumulative Proportion 0.9993279 0.999493 0.9996273 0.9997567 9.998e-01
##                            PC75      PC76
## Eigenvalue            6.205e-03 4.871e-03
## Proportion Explained  8.531e-05 6.697e-05
## Cumulative Proportion 9.999e-01 1.000e+00
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## * General scaling constant of scores:  14.14884 
## 
## 
## Species scores
## 
##              PC1     PC2      PC3      PC4      PC5       PC6
## DYRK1A_N -0.2295 -0.6686  0.06313 -0.37251  0.26677 -0.009456
## ITSN1_N  -0.4979 -0.7818  0.03282 -0.26909  0.34936  0.100461
## BDNF_N   -1.5676 -0.3441  0.06403 -0.44447  0.15261 -0.066180
## NR1_N    -1.5706 -0.2778  0.51765 -0.01343 -0.11566  0.075059
## NR2A_N   -1.3212 -0.3145  0.67291 -0.05494 -0.04272 -0.015097
## pAKT_N   -0.9396  0.8154 -0.50893 -0.42398 -0.21462  0.216800
## ....                                                         
## 
## 
## Site scores (weighted sums of species scores)
## 
##          PC1     PC2    PC3    PC4     PC5     PC6
## 76   -0.7937 -0.5367 0.4364 0.4141  0.2217 -0.8140
## 77   -0.7374 -0.4628 0.3746 0.5014  0.3151 -1.0306
## 78   -0.8675 -0.4535 0.3847 0.3733  0.4134 -0.9041
## 79   -0.2998 -0.5313 0.3439 0.1274 -0.4611 -0.5489
## 80   -0.3628 -0.4460 0.1840 0.2636 -0.2013 -0.8447
## 81   -0.3663 -0.4273 0.1751 0.3349 -0.1623 -0.7517
## ....

Plot factors’ biplot and total biplot

biplot(pca, scaling = "species", display = "species")

biplot(pca)

Plot explained by PCA proportions (just for first 10 axes)

pca_summary <- summary(pca)
pca_result <- as.data.frame(pca_summary$cont)
plot_data <- as.data.frame(t(as.matrix(pca_result[c("Proportion Explained"),])))
plot_data$component <- rownames(plot_data)
plot_data$component  <- factor(plot_data$component , levels = plot_data$component)

ggplot(plot_data[1:10,], aes(component, `Proportion Explained`)) + 
  geom_bar(stat = "identity") + 
  scale_x_discrete(guide = guide_axis(angle = 90))

Draw plots, and 3D plot for first 3 axes

df_scores <- data.frame(scores(pca, display = "sites", choices = c(1, 2, 3), scaling = "sites"), data %>% na.omit)

p_genotype <- ggplot(df_scores, aes(x = PC1, y = PC2)) + 
  geom_point(aes(color = Genotype), alpha = 0.5)
p_treatment <- ggplot(df_scores, aes(x = PC1, y = PC2)) + 
  geom_point(aes(color = Treatment), alpha = 0.5)
p_behavior <- ggplot(df_scores, aes(x = PC1, y = PC2)) + 
  geom_point(aes(color = Behavior), alpha = 0.5)

ggarrange(p_genotype, p_treatment, p_behavior, nrow = 2, ncol = 2)

plot_ly(x=df_scores$PC1, y=df_scores$PC2, z=df_scores$PC3, color = df_scores$class, colors = "Dark2", type="scatter3d", mode="markers", size = 1) %>%
  layout(title = "PCA 3d scatter plot",
  scene = list(
    xaxis = list(title = "PC1"),
    yaxis = list(title = "PC2"),
    zaxis = list(title = "PC3")
  ))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

6 Differential expression

DeSEQ2 works only with integer numbers, so the minimum value from dataset is 0.0525284, maximum is 8.4825534. If we multipe it to 20 and round, minimum will be around 1. (Actually, I am not sure, that this rough transformation doesn’t ruin all data…)

Let’s find out, where we can find significant difference in protein abundance according metadata

d_data <- data.frame(t(as.matrix(data %>% na.omit %>% keep(is.numeric)))) %>% 
  mutate(across(where(is.numeric), function(x){round(100*x)}))

d_metadata <- data %>% na.omit %>% keep(is.factor)
colnames(d_data) <- d_metadata$MouseID
rownames(d_data) <- colnames(proteins_scale)

d_data[1:10,1:10]
##           3415_1 3415_2 3415_3 3415_4 3415_5 3415_6 3415_7 3415_8 3415_9
## DYRK1A_N      65     62     64     58     54     57     49     49     51
## ITSN1_N       83     84     85     76     76     76     67     63     66
## BDNF_N        41     39     40     35     35     34     31     30     31
## NR1_N        292    286    297    262    263    260    244    232    242
## NR2A_N       517    519    535    473    474    476    426    415    434
## pAKT_N        21     22     21     21     21     20     23     20     21
## pBRAF_N       18     17     17     16     17     16     16     15     15
## pCAMKII_N    373    365    381    378    387    384    373    357    383
## pCREB_N       24     22     22     19     19     19     19     16     17
## pELK_N       167    157    174    151    153    158    131    129    143
##           3415_10
## DYRK1A_N       41
## ITSN1_N        54
## BDNF_N         27
## NR1_N         203
## NR2A_N        339
## pAKT_N         22
## pBRAF_N        16
## pCAMKII_N     307
## pCREB_N        15
## pELK_N        117
d_metadata[1:10,]
## # A tibble: 10 x 5
##    MouseID Genotype Treatment Behavior class 
##    <fct>   <fct>    <fct>     <fct>    <fct> 
##  1 3415_1  Control  Memantine C/S      c-CS-m
##  2 3415_2  Control  Memantine C/S      c-CS-m
##  3 3415_3  Control  Memantine C/S      c-CS-m
##  4 3415_4  Control  Memantine C/S      c-CS-m
##  5 3415_5  Control  Memantine C/S      c-CS-m
##  6 3415_6  Control  Memantine C/S      c-CS-m
##  7 3415_7  Control  Memantine C/S      c-CS-m
##  8 3415_8  Control  Memantine C/S      c-CS-m
##  9 3415_9  Control  Memantine C/S      c-CS-m
## 10 3415_10 Control  Memantine C/S      c-CS-m
significant_proteins <- function(model){
  dds <- DESeqDataSetFromMatrix(countData=d_data, 
                              colData=d_metadata, 
                              design=model)
  dds <- DESeq(dds)
  res <- data.frame(results(dds))
  rownames(res) <- rownames(d_data)

  res <- res[res$padj < 0.05,]
  rownames(res)
}

significant_proteins(~Behavior)
##  [1] "DYRK1A_N"        "ITSN1_N"         "BDNF_N"          "NR1_N"          
##  [5] "NR2A_N"          "pAKT_N"          "pBRAF_N"         "pCAMKII_N"      
##  [9] "pCREB_N"         "pELK_N"          "pERK_N"          "pJNK_N"         
## [13] "PKCA_N"          "pMEK_N"          "pNR2A_N"         "pPKCAB_N"       
## [17] "pRSK_N"          "AKT_N"           "BRAF_N"          "CAMKII_N"       
## [21] "CREB_N"          "ELK_N"           "ERK_N"           "GSK3B_N"        
## [25] "TRKA_N"          "APP_N"           "SOD1_N"          "MTOR_N"         
## [29] "P38_N"           "pMTOR_N"         "DSCR1_N"         "NR2B_N"         
## [33] "pNUMB_N"         "RAPTOR_N"        "pP70S6_N"        "NUMB_N"         
## [37] "pGSK3B_N"        "pPKCG_N"         "CDK5_N"          "S6_N"           
## [41] "ADARB1_N"        "ARC_N"           "ERBB4_N"         "nNOS_N"         
## [45] "Tau_N"           "GFAP_N"          "GluR3_N"         "IL1B_N"         
## [49] "pCASP9_N"        "PSD95_N"         "SNCA_N"          "Ubiquitin_N"    
## [53] "pGSK3B_Tyr216_N" "SHH_N"           "BAD_N"           "BCL2_N"         
## [57] "pS6_N"           "H3AcK18_N"       "EGR1_N"          "H3MeK4_N"       
## [61] "CaNA_N"
significant_proteins(~Treatment)
##  [1] "DYRK1A_N"     "ITSN1_N"      "NR1_N"        "pAKT_N"       "pCAMKII_N"   
##  [6] "pERK_N"       "pJNK_N"       "pNR2A_N"      "pPKCAB_N"     "pRSK_N"      
## [11] "AKT_N"        "BRAF_N"       "ELK_N"        "ERK_N"        "TRKA_N"      
## [16] "Bcatenin_N"   "SOD1_N"       "P38_N"        "pMTOR_N"      "DSCR1_N"     
## [21] "NR2B_N"       "pP70S6_N"     "NUMB_N"       "pGSK3B_N"     "pPKCG_N"     
## [26] "CDK5_N"       "ADARB1_N"     "AcetylH3K9_N" "BAX_N"        "NA"          
## [31] "NA.1"         "GluR3_N"      "NA.2"         "IL1B_N"       "P3525_N"     
## [36] "PSD95_N"      "Ubiquitin_N"  "NA.3"         "NA.4"         "SYP_N"       
## [41] "H3AcK18_N"    "EGR1_N"       "CaNA_N"
significant_proteins(~Genotype)
##  [1] "DYRK1A_N"        "ITSN1_N"         "NR1_N"           "NR2A_N"         
##  [5] "pCREB_N"         "pERK_N"          "pNR1_N"          "pNR2A_N"        
##  [9] "pNR2B_N"         "pRSK_N"          "AKT_N"           "BRAF_N"         
## [13] "GSK3B_N"         "APP_N"           "Bcatenin_N"      "SOD1_N"         
## [17] "MTOR_N"          "P38_N"           "pMTOR_N"         "AMPKA_N"        
## [21] "NR2B_N"          "RAPTOR_N"        "pP70S6_N"        "pPKCG_N"        
## [25] "S6_N"            "AcetylH3K9_N"    "ARC_N"           "Tau_N"          
## [29] "GluR3_N"         "IL1B_N"          "pCASP9_N"        "PSD95_N"        
## [33] "SNCA_N"          "pGSK3B_Tyr216_N" "pS6_N"           "pCFOS_N"        
## [37] "SYP_N"           "H3AcK18_N"       "EGR1_N"          "CaNA_N"